import numpy as np
import pandas as pd # data processing, CSV file
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Load the customer table from segmented_customers.csv and preview the
# first five rows.
df = pd.read_csv("segmented_customers.csv")
df.head()
| CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | cluster | |
|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 19 | 15 | 39 | 3 |
| 1 | 2 | 1 | 21 | 15 | 81 | 4 |
| 2 | 3 | 0 | 20 | 16 | 6 | 3 |
| 3 | 4 | 0 | 23 | 16 | 77 | 4 |
| 4 | 5 | 0 | 31 | 17 | 40 | 3 |
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 200 entries, 0 to 199 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 200 non-null int64 1 Gender 200 non-null int64 2 Age 200 non-null int64 3 Annual Income (k$) 200 non-null int64 4 Spending Score (1-100) 200 non-null int64 5 cluster 200 non-null int64 dtypes: int64(6) memory usage: 9.5 KB
# Summary statistics (count, mean, std, quartiles) for every numeric column.
df.describe()
| CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | cluster | |
|---|---|---|---|---|---|---|
| count | 200.000000 | 200.000000 | 200.000000 | 200.000000 | 200.000000 | 200.000000 |
| mean | 100.500000 | 0.440000 | 38.850000 | 60.560000 | 50.200000 | 1.760000 |
| std | 57.879185 | 0.497633 | 13.969007 | 26.264721 | 25.823522 | 1.191427 |
| min | 1.000000 | 0.000000 | 18.000000 | 15.000000 | 1.000000 | 0.000000 |
| 25% | 50.750000 | 0.000000 | 28.750000 | 41.500000 | 34.750000 | 1.000000 |
| 50% | 100.500000 | 0.000000 | 36.000000 | 61.500000 | 50.000000 | 2.000000 |
| 75% | 150.250000 | 1.000000 | 49.000000 | 78.000000 | 73.000000 | 2.000000 |
| max | 200.000000 | 1.000000 | 70.000000 | 137.000000 | 99.000000 | 4.000000 |
# Show the first five rows again before plotting.
df.head()
| CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | cluster | |
|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 19 | 15 | 39 | 3 |
| 1 | 2 | 1 | 21 | 15 | 81 | 4 |
| 2 | 3 | 0 | 20 | 16 | 6 | 3 |
| 3 | 4 | 0 | 23 | 16 | 77 | 4 |
| 4 | 5 | 0 | 31 | 17 | 40 | 3 |
# Pairwise relationship grid over all columns of df; CustomerID is still
# present at this point (it is dropped in a later cell).
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x1bf06268e50>
# Remove the CustomerID column from the working frame, then preview it.
df = df.drop(columns=['CustomerID'])
df.head()
| Gender | Age | Annual Income (k$) | Spending Score (1-100) | cluster | |
|---|---|---|---|---|---|
| 0 | 1 | 19 | 15 | 39 | 3 |
| 1 | 1 | 21 | 15 | 81 | 4 |
| 2 | 0 | 20 | 16 | 6 | 3 |
| 3 | 0 | 23 | 16 | 77 | 4 |
| 4 | 0 | 31 | 17 | 40 | 3 |
# Heatmap of the pairwise correlation matrix of the remaining columns.
sns.heatmap(df.corr())
<AxesSubplot:>
# Pie chart of how many customers fall under each Gender code.
plt.figure(figsize=(7, 7))
size = df['Gender'].value_counts()
# value_counts() orders its result by frequency, so a hard-coded label list
# would silently swap if the counts flipped. Build the labels from the
# counted codes instead.
# assumes Gender is encoded 0 = Female, 1 = Male — TODO confirm against the raw data
label = [{0: 'Female', 1: 'Male'}.get(code, str(code)) for code in size.index]
color = ['purple', 'Blue']
explode = [0, 0.1]
plt.pie(size, explode=explode, labels=label, colors=color, shadow=True)
plt.legend()
plt.show()
# Number of customers at each distinct age.
plt.figure(figsize=(10, 5))
# Pass the series as the keyword `x`: seaborn warns that positional data
# arguments stop being accepted from version 0.12.
sns.countplot(x=df['Age'])
# Rotate the tick labels so the many age values do not overlap.
plt.xticks(rotation=90)
c:\users\user\appdata\local\programs\python\python38\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]),
[Text(0, 0, '18'),
Text(1, 0, '19'),
Text(2, 0, '20'),
Text(3, 0, '21'),
Text(4, 0, '22'),
Text(5, 0, '23'),
Text(6, 0, '24'),
Text(7, 0, '25'),
Text(8, 0, '26'),
Text(9, 0, '27'),
Text(10, 0, '28'),
Text(11, 0, '29'),
Text(12, 0, '30'),
Text(13, 0, '31'),
Text(14, 0, '32'),
Text(15, 0, '33'),
Text(16, 0, '34'),
Text(17, 0, '35'),
Text(18, 0, '36'),
Text(19, 0, '37'),
Text(20, 0, '38'),
Text(21, 0, '39'),
Text(22, 0, '40'),
Text(23, 0, '41'),
Text(24, 0, '42'),
Text(25, 0, '43'),
Text(26, 0, '44'),
Text(27, 0, '45'),
Text(28, 0, '46'),
Text(29, 0, '47'),
Text(30, 0, '48'),
Text(31, 0, '49'),
Text(32, 0, '50'),
Text(33, 0, '51'),
Text(34, 0, '52'),
Text(35, 0, '53'),
Text(36, 0, '54'),
Text(37, 0, '55'),
Text(38, 0, '56'),
Text(39, 0, '57'),
Text(40, 0, '58'),
Text(41, 0, '59'),
Text(42, 0, '60'),
Text(43, 0, '63'),
Text(44, 0, '64'),
Text(45, 0, '65'),
Text(46, 0, '66'),
Text(47, 0, '67'),
Text(48, 0, '68'),
Text(49, 0, '69'),
Text(50, 0, '70')])
# Distribution of annual income for each Gender code. The data is passed as
# x=/y= keywords because seaborn rejects positional data arguments from 0.12.
sns.boxplot(x=df['Gender'], y=df['Annual Income (k$)'])
c:\users\user\appdata\local\programs\python\python38\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='Gender', ylabel='Annual Income (k$)'>
# Number of customers at each distinct annual-income value.
plt.figure(figsize=(15, 5))
# Keyword `x=` avoids seaborn's positional-argument deprecation (an error
# from version 0.12 onwards).
sns.countplot(x=df['Annual Income (k$)'])
c:\users\user\appdata\local\programs\python\python38\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='Annual Income (k$)', ylabel='count'>
# Bar chart of spending score against annual income.
plt.bar(df['Annual Income (k$)'], df['Spending Score (1-100)'])
# The title previously ended with an unbalanced extra ')'.
plt.title('Spending Score (1-100) over Annual Income (k$)', fontsize=20)
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
Text(0, 0.5, 'Spending Score (1-100)')
# Preview the frame (CustomerID already dropped) before selecting features.
df.head()
| Gender | Age | Annual Income (k$) | Spending Score (1-100) | cluster | |
|---|---|---|---|---|---|
| 0 | 1 | 19 | 15 | 39 | 3 |
| 1 | 1 | 21 | 15 | 81 | 4 |
| 2 | 0 | 20 | 16 | 6 | 3 |
| 3 | 0 | 23 | 16 | 77 | 4 |
| 4 | 0 | 31 | 17 | 40 | 3 |
# Feature matrix for DBSCAN. Positions 1 and 3 of the frame (after CustomerID
# was dropped) are Age and Spending Score; selecting them by name keeps the
# choice correct if the column order ever changes.
x = df[['Age', 'Spending Score (1-100)']].values
x.shape
(200, 2)
from sklearn.cluster import DBSCAN
# Density-based clustering with a neighbourhood radius of 1 and at least two
# samples per dense region. x is passed unscaled, so eps is measured in the
# raw units of the two feature columns.
db=DBSCAN(eps=1,min_samples=2,metric='euclidean')
model=db.fit(x)
# One cluster id per row of x; the fitted output contains -1 entries.
label=model.labels_
label
array([-1, 0, 1, 2, -1, -1, -1, -1, -1, 3, -1, 4, 5, 2, -1, -1, 6,
-1, -1, 4, 6, -1, -1, 3, -1, 7, -1, -1, -1, -1, -1, -1, -1, -1,
-1, 0, 8, 3, 9, -1, -1, -1, 10, -1, -1, -1, -1, -1, -1, 11, -1,
12, -1, -1, 13, 14, -1, -1, 15, 16, 17, 18, -1, -1, -1, 19, -1, -1,
19, -1, 17, 14, -1, -1, -1, 20, -1, -1, 21, 14, -1, -1, -1, -1, -1,
-1, -1, -1, 12, 13, -1, -1, -1, -1, 11, 21, -1, 15, 14, -1, -1, -1,
-1, 20, -1, -1, -1, 16, -1, -1, -1, 18, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, 22, -1, -1, -1, -1, -1, 23, -1, 23, -1, -1, 1, 24,
-1, 3, -1, -1, -1, 25, -1, 26, -1, -1, 10, 3, 27, 28, 8, -1, -1,
23, 29, -1, -1, -1, -1, 3, -1, 7, 1, 25, -1, -1, -1, -1, 9, -1,
-1, -1, -1, -1, -1, -1, 5, -1, -1, 28, -1, 26, -1, 24, -1, -1, -1,
-1, 8, -1, 27, -1, -1, 22, 29, -1, -1, 3, -1, 7], dtype=int64)
from sklearn import metrics
# Boolean mask, one entry per sample, set to True for DBSCAN core samples.
sample_cores = np.zeros_like(label, dtype=bool)
sample_cores[db.core_sample_indices_] = True
# Number of distinct cluster ids, not counting the -1 label.
n_clusters = len(set(label) - {-1})
print('No of clusters:', n_clusters)
No of clusters: 30
# Fit DBSCAN on x and get one cluster id per row.
y_means = db.fit_predict(x)
plt.figure(figsize=(10, 10))
# Draw the points labelled -1 first, in grey, so the clusters sit on top.
noise_mask = y_means == -1
plt.scatter(x[noise_mask, 0], x[noise_mask, 1], s=50, c='lightgrey', label='noise')
# Plot every cluster that was found. The previous hard-coded list stopped at
# id 10 even though the fit produced more clusters than that.
cluster_ids = sorted(set(y_means) - {-1})
for position, cluster_id in enumerate(cluster_ids):
    members = y_means == cluster_id
    # Spread the colours evenly over a continuous colormap so any number of
    # clusters gets distinct colours.
    shade = plt.cm.nipy_spectral(position / max(len(cluster_ids), 1))
    plt.scatter(x[members, 0], x[members, 1], s=50, color=shade)
# x holds the Age and Spending Score columns, in that order.
plt.xlabel('Age')
plt.ylabel('Spending Score (1-100)')
plt.title('Clusters of data')
plt.legend()
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly.graph_objs as go
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
# Reload the customer table from disk (all columns, including CustomerID and
# cluster) and preview the first five rows.
df = pd.read_csv('segmented_customers.csv')
df.head()
| CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | cluster | |
|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 19 | 15 | 39 | 3 |
| 1 | 2 | 1 | 21 | 15 | 81 | 4 |
| 2 | 3 | 0 | 20 | 16 | 6 | 3 |
| 3 | 4 | 0 | 23 | 16 | 77 | 4 |
| 4 | 5 | 0 | 31 | 17 | 40 | 3 |
# Count missing values per column.
df.isnull().sum()
CustomerID 0 Gender 0 Age 0 Annual Income (k$) 0 Spending Score (1-100) 0 cluster 0 dtype: int64
# Summary statistics for every numeric column of the reloaded frame.
df.describe()
| CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | cluster | |
|---|---|---|---|---|---|---|
| count | 200.000000 | 200.000000 | 200.000000 | 200.000000 | 200.000000 | 200.000000 |
| mean | 100.500000 | 0.440000 | 38.850000 | 60.560000 | 50.200000 | 1.760000 |
| std | 57.879185 | 0.497633 | 13.969007 | 26.264721 | 25.823522 | 1.191427 |
| min | 1.000000 | 0.000000 | 18.000000 | 15.000000 | 1.000000 | 0.000000 |
| 25% | 50.750000 | 0.000000 | 28.750000 | 41.500000 | 34.750000 | 1.000000 |
| 50% | 100.500000 | 0.000000 | 36.000000 | 61.500000 | 50.000000 | 2.000000 |
| 75% | 150.250000 | 1.000000 | 49.000000 | 78.000000 | 73.000000 | 2.000000 |
| max | 200.000000 | 1.000000 | 70.000000 | 137.000000 | 99.000000 | 4.000000 |
# Side-by-side distribution plots (histogram plus density curve) for the
# three numeric customer features.
plt.figure(1, figsize=(15, 6))
feature_names = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
# The loop variable is not called `x`, so it cannot overwrite a feature
# matrix of that name defined elsewhere in the notebook.
for position, column in enumerate(feature_names, start=1):
    plt.subplot(1, 3, position)
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    # histplot(kde=True, stat='density') is the supported replacement for the
    # deprecated sns.distplot.
    sns.histplot(df[column], bins=15, kde=True, stat='density')
    plt.title('Distplot of {}'.format(column))
plt.show()
# Integer-encode the Gender column: learn the distinct values first, then
# replace the column with their codes, and preview the result.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(df['Gender'])
df['Gender'] = label_encoder.transform(df['Gender'])
df.head()
| CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | cluster | |
|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 19 | 15 | 39 | 3 |
| 1 | 2 | 1 | 21 | 15 | 81 | 4 |
| 2 | 3 | 0 | 20 | 16 | 6 | 3 |
| 3 | 4 | 0 | 23 | 16 | 77 | 4 |
| 4 | 5 | 0 | 31 | 17 | 40 | 3 |
# Heatmap of the frame's raw cell values (rows = customers, columns = fields).
# NOTE(review): this colours the raw values, not df.corr() — confirm that a
# correlation heatmap was not intended here.
plt.figure(1, figsize = (16 ,8))
sns.heatmap(df)
plt.show()
# Ward-linkage dendrogram of the customers.
plt.figure(1, figsize=(16, 8))
# Link on customer attributes only. CustomerID is a per-row identifier and
# `cluster` holds labels loaded from the CSV; feeding either into the
# distance computation would distort the hierarchy.
dendrogram = sch.dendrogram(
    sch.linkage(
        df.drop(columns=['CustomerID', 'cluster'], errors='ignore'),
        method="ward",
    )
)
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean distances')
plt.show()
# Agglomerative clustering into five groups using average linkage.
# The distance metric is left at its default (Euclidean): the `affinity`
# keyword was renamed in newer scikit-learn releases, so omitting it keeps
# this call valid across versions without changing the metric.
# NOTE(review): the dendrogram elsewhere in this notebook uses Ward linkage
# while this model uses 'average' — confirm which one is intended.
hc = AgglomerativeClustering(n_clusters=5, linkage='average')
# Fit on customer attributes only. CustomerID (a row identifier) and the
# `cluster` labels loaded from the CSV must not act as features.
y_hc = hc.fit_predict(df.drop(columns=['CustomerID', 'cluster'], errors='ignore'))
y_hc
array([3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4,
3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 2,
3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0, 1, 2, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
0, 1], dtype=int64)
# Store the hierarchical-clustering labels on the frame, then draw an
# interactive 3D scatter of Age / Spending Score / Annual Income coloured by
# those labels.
df['cluster'] = pd.DataFrame(y_hc)
trace1 = go.Scatter3d(
    x=df['Age'],
    y=df['Spending Score (1-100)'],
    z=df['Annual Income (k$)'],
    mode='markers',
    marker={
        'color': df['cluster'],
        'size': 10,
        # Marker outlines use the same per-cluster colouring as the fill.
        'line': {'color': df['cluster'], 'width': 12},
        'opacity': 0.8,
    },
)
data = [trace1]
layout = go.Layout(
    title='Clusters using Agglomerative Clustering',
    scene={
        'xaxis': {'title': 'Age'},
        'yaxis': {'title': 'Spending Score'},
        'zaxis': {'title': 'Annual Income'},
    },
)
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)
# 2D view of the hierarchical clusters: annual income against spending score.
# The columns are selected by name rather than position so the axes always
# match their labels.
X = df[['Annual Income (k$)', 'Spending Score (1-100)']].values
cluster_colours = ['red', 'blue', 'green', 'purple', 'orange']
for cluster_id, colour in enumerate(cluster_colours):
    members = y_hc == cluster_id
    plt.scatter(
        X[members, 0],
        X[members, 1],
        s=100,
        c=colour,
        label='Cluster {}'.format(cluster_id + 1),
    )
plt.title('Clusters of Customers (Hierarchical Clustering Model)')
plt.xlabel('Annual Income(k$)')
plt.ylabel('Spending Score(1-100)')
# The per-cluster labels are only visible once the legend is drawn.
plt.legend()
plt.show()
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
%matplotlib inline
# Reload the customer table from disk for the k-means section and preview it.
df = pd.read_csv('segmented_customers.csv')
df.head()
| CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | cluster | |
|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 19 | 15 | 39 | 3 |
| 1 | 2 | 1 | 21 | 15 | 81 | 4 |
| 2 | 3 | 0 | 20 | 16 | 6 | 3 |
| 3 | 4 | 0 | 23 | 16 | 77 | 4 |
| 4 | 5 | 0 | 31 | 17 | 40 | 3 |
# Raw scatter of age against annual income, before any clustering.
plt.scatter(df['Age'], df['Annual Income (k$)'])
<matplotlib.collections.PathCollection at 0x1bf0c31f460>
# Three-cluster k-means. The seed is fixed so that cluster numbering and
# centroids are the same on every run; without it the initialisation is
# random.
km = KMeans(n_clusters=3, random_state=0)
km
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
random_state=None, tol=0.0001, verbose=0)
# Fit k-means on the (Age, Annual Income) columns and get a cluster id per row.
y_predicted = km.fit_predict(df[['Age', 'Annual Income (k$)']])
y_predicted
array([2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 2, 0, 2,
0, 2, 0, 2, 2, 2, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 2,
0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2,
0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1])
# Overwrite the `cluster` column loaded from the CSV with the k-means labels.
df['cluster'] = y_predicted
df.head()
| CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | cluster | |
|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 19 | 15 | 39 | 2 |
| 1 | 2 | 1 | 21 | 15 | 81 | 2 |
| 2 | 3 | 0 | 20 | 16 | 6 | 2 |
| 3 | 4 | 0 | 23 | 16 | 77 | 2 |
| 4 | 5 | 0 | 31 | 17 | 40 | 2 |
# Centroid coordinates, one row per cluster, in the same (Age, Annual Income)
# column order that was passed to fit_predict.
km.cluster_centers_
array([[55.03448276, 50.34482759],
[34.34782609, 82.82608696],
[28.36 , 31.44 ]])
# Split the frame by k-means label and plot each group in its own colour,
# with the fitted centroids overlaid as purple stars.
df1, df2, df3 = (df[df['cluster'] == cluster_id] for cluster_id in range(3))
for subset, colour in zip((df1, df2, df3), ('green', 'red', 'black')):
    plt.scatter(subset['Age'], subset['Annual Income (k$)'], color=colour)
plt.scatter(
    km.cluster_centers_[:, 0],
    km.cluster_centers_[:, 1],
    color='purple',
    marker='*',
    label='centroid',
)
plt.xlabel('Age')
plt.ylabel('Annual Income (k$)')
plt.legend()
<matplotlib.legend.Legend at 0x1bf0c3a2370>
# Rescale Age and Annual Income to the [0, 1] range so neither feature
# dominates the Euclidean distances used by k-means.
scaler = MinMaxScaler()
# Fit both columns in one call. MinMaxScaler scales each column
# independently, so the values match fitting them one at a time, and
# `scaler` stays fitted on both features instead of only the last one.
scaled = scaler.fit_transform(df[['Age', 'Annual Income (k$)']])
df['Age'] = scaled[:, 0]
df['Annual Income (k$)'] = scaled[:, 1]
df
| CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | cluster | |
|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0.019231 | 0.000000 | 39 | 2 |
| 1 | 2 | 1 | 0.057692 | 0.000000 | 81 | 2 |
| 2 | 3 | 0 | 0.038462 | 0.008197 | 6 | 2 |
| 3 | 4 | 0 | 0.096154 | 0.008197 | 77 | 2 |
| 4 | 5 | 0 | 0.250000 | 0.016393 | 40 | 2 |
| ... | ... | ... | ... | ... | ... | ... |
| 195 | 196 | 0 | 0.326923 | 0.860656 | 79 | 1 |
| 196 | 197 | 0 | 0.519231 | 0.909836 | 28 | 1 |
| 197 | 198 | 1 | 0.269231 | 0.909836 | 74 | 1 |
| 198 | 199 | 1 | 0.269231 | 1.000000 | 18 | 1 |
| 199 | 200 | 1 | 0.230769 | 1.000000 | 83 | 1 |
200 rows × 6 columns
# Re-run three-cluster k-means on the current (Age, Annual Income) columns.
# The seed is fixed so the cluster numbering is reproducible between runs.
km = KMeans(n_clusters=3, random_state=0)
y_predicted = km.fit_predict(df[['Age', 'Annual Income (k$)']])
y_predicted
array([0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0,
2, 0, 2, 0, 2, 0, 0, 0, 2, 0, 2, 0, 2, 0, 2, 0, 0, 0, 2, 0, 2, 0,
2, 0, 2, 0, 0, 0, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 0,
2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 1, 0, 2, 2, 0, 2, 2, 0, 2, 2, 0,
0, 2, 2, 0, 2, 1, 0, 0, 2, 0, 2, 0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2,
2, 0, 1, 0, 0, 0, 2, 2, 2, 2, 0, 1, 1, 1, 0, 1, 1, 1, 2, 1, 2, 1,
1, 1, 0, 1, 1, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,
2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1])
# Replace the `cluster` column with the labels from the latest k-means fit.
df['cluster']=y_predicted
df.head()
| CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | cluster | |
|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0.019231 | 0.000000 | 39 | 0 |
| 1 | 2 | 1 | 0.057692 | 0.000000 | 81 | 0 |
| 2 | 3 | 0 | 0.038462 | 0.008197 | 6 | 0 |
| 3 | 4 | 0 | 0.096154 | 0.008197 | 77 | 0 |
| 4 | 5 | 0 | 0.250000 | 0.016393 | 40 | 0 |
# Centroids of the latest fit, in (Age, Annual Income) column order.
km.cluster_centers_
array([[0.15264423, 0.21016906],
[0.33653846, 0.58435792],
[0.72175481, 0.29943648]])
# Split the frame by k-means label and plot each group in its own colour,
# with the fitted centroids overlaid as purple stars.
df1 = df[df.cluster == 0]
df2 = df[df.cluster == 1]
df3 = df[df.cluster == 2]
plt.scatter(df1.Age, df1['Annual Income (k$)'], color='green')
plt.scatter(df2.Age, df2['Annual Income (k$)'], color='red')
plt.scatter(df3.Age, df3['Annual Income (k$)'], color='black')
plt.scatter(
    km.cluster_centers_[:, 0],
    km.cluster_centers_[:, 1],
    color='purple',
    marker='*',
    label='centroid',
)
# Label the axes so the figure can be read on its own.
plt.xlabel('Age')
plt.ylabel('Annual Income (k$)')
plt.legend()
<matplotlib.legend.Legend at 0x1bf0c4261c0>
# Elbow method: fit k-means for k = 1..9 and record the within-cluster sum
# of squared distances (inertia) for each k.
sse = []
k_rng = range(1, 10)
for k in k_rng:
    # Fixed seed so the curve is identical on every run.
    km = KMeans(n_clusters=k, random_state=0)
    km.fit(df[['Age', 'Annual Income (k$)']])
    sse.append(km.inertia_)
plt.xlabel('K')
plt.ylabel('Sum of squared error')
plt.plot(k_rng, sse)
[<matplotlib.lines.Line2D at 0x1bf0c732b20>]